if (rstudioapi::isAvailable()) {
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')## [conflicted] Will prefer dplyr::intersect over any other package
cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')## Rows: 1560 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (20): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cqmake count table
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group, n_db) %>%
unique() %>%
# only keep high abundant circRNAs
filter(count_group == 'count ≥ 5') %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
# change nr of databases to binary
mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
group_by(n_db) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-n_db)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("novel", "in_db")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## novel 16.23933 132.7607
## in_db 80.76067 660.2393
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 162.61, df = 1, p-value < 2.2e-16
OR
OR = (705/36)/(88/61)
OR## [1] 13.57481
make count table
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group, n_detected) %>%
unique() %>%
filter(count_group == 'count ≥ 5') %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
# change nr of databases to binary
mutate(n_detected = ifelse(n_detected == 1, 0, 1)) %>%
group_by(n_detected) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-n_detected)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("unique", "multiple_tools")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## unique 7.738202 63.2618
## multiple_tools 89.261798 729.7382
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 330.06, df = 1, p-value < 2.2e-16
OR
OR = (776/43)/(17/54)
OR## [1] 57.32421
make count table
cont_table = cq %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group, nr_exons) %>%
unique() %>%
filter(count_group == 'count ≥ 5') %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
# also remove the ones that do not have a exon annotation
filter(!is.na(nr_exons), !nr_exons == "ambiguous") %>%
mutate(exon_bin = ifelse(nr_exons == 1, 0, 1)) %>%
group_by(exon_bin) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
mutate(fail = ifelse(is.na(fail), 0, fail)) %>%
select(-exon_bin)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("single_exon", "multi_exons")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## single_exon 7.69018 131.3098
## multi_exons 32.30982 551.6902
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 16.399, df = 1, p-value = 5.13e-05
OR
OR = (562/22)/(121/18)
OR## [1] 3.80015
cq_start = cq %>%
mutate(start_exon_nr = substr(start_match, 19, 27),
start_exon_nr = ifelse(substr(start_exon_nr, 1, 1) == '_',
substr(start_exon_nr, 2, 10),
start_exon_nr))
cq_start cont_table = cq_start %>%
filter(count_group == 'count ≥ 5') %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, start_exon_nr) %>%
unique() %>%
filter(!is.na(start_exon_nr)) %>%
mutate(exon_1 = ifelse(start_exon_nr == "exon_1", 1, 0)) %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(exon_1) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup()
cont_table=> not enough values
cont_table = cq %>%
filter(count_group == 'count ≥ 5',
!strand == 'unknown') %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ss_motif) %>%
unique() %>%
mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(ss_can) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-ss_can)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("non_cannonical", "cannonical")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## non_cannonical 11.99169 99.00831
## cannonical 66.00831 544.99169
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 42.044, df = 1, p-value = 8.923e-11
OR
OR = (565/46)/(79/32)
OR## [1] 4.975234
cont_table = cq %>%
filter(count_group == "count ≥ 5") %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ENST) %>%
unique() %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail'),
ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>%
# change nr of databases to binary
group_by(ENST_group) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-ENST_group)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("match", "no_match")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## match 87.637486 717.36251
## no_match 9.362514 76.63749
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 203.2, df = 1, p-value < 2.2e-16
OR
OR = (757/48)/(37/49)
OR## [1] 20.8857
cont_table = cq %>%
filter(count_group == "count ≥ 5") %>%
left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, approach) %>%
filter(!approach == 'integrative') %>%
unique() %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(approach) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-approach)## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "tool"
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("candidate-based", "segmented read-based")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## candidate-based 21.54762 159.4524
## segmented read-based 73.45238 543.5476
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 6.8785, df = 1, p-value = 0.008724
OR
OR = (170/11)/(533/84)
OR## [1] 2.435613
cont_table = cq %>%
filter(!count_group == "no_counts") %>%
select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group) %>%
unique() %>%
# to use all val together
filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass',
'pass', 'fail')) %>%
# change nr of databases to binary
group_by(count_group) %>%
count(all_val) %>%
pivot_wider(values_from = n, names_from = all_val) %>%
ungroup() %>%
select(-count_group)
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("count < 5", "count ≥ 5")
cont_tableplot data
mosaicplot(cont_table,
main = "Mosaic plot",
color = TRUE)if chisquare expected < 5 => then Fisher should be used
chisq.test(cont_table)$expected## fail pass
## count < 5 19.83877 132.1612
## count ≥ 5 116.16123 773.8388
chisquare test
chisq.test(cont_table)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cont_table
## X-squared = 23.636, df = 1, p-value = 1.164e-06
OR
OR = (793/97)/(113/39)
OR## [1] 2.821549
tool_anno = read_tsv('../data/details/tool_annotation.txt')## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val = read_tsv('../data/Supplementary_Table_4_precision_values.txt')## Rows: 29 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (14): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
add annotation to sensitivity
sens_anno = val %>%
group_by(tool) %>%
slice(1) %>%
select(tool, sensitivity) %>%
left_join(tool_anno) %>%
select(-tool_lt) %>% ungroup()## Joining, by = "tool"
sens_annowilcox.test(sensitivity ~ approach, data=sens_anno %>% filter(!approach == 'integrative')) ##
## Wilcoxon rank sum exact test
##
## data: sensitivity by approach
## W = 6, p-value = 0.1264
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(sensitivity ~ approach, data=sens_anno)##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by approach
## Kruskal-Wallis chi-squared = 3.3209, df = 2, p-value = 0.1901
wilcox.test(sensitivity ~ lin_annotation, data=sens_anno)##
## Wilcoxon rank sum exact test
##
## data: sensitivity by lin_annotation
## W = 22, p-value = 0.8
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(sensitivity ~ strand_anno, data=sens_anno %>%
filter(!strand_anno == "no strand reported"))##
## Kruskal-Wallis rank sum test
##
## data: sensitivity by strand_anno
## Kruskal-Wallis chi-squared = 5.2538, df = 3, p-value = 0.1541
wilcox_ss = wilcox.test(sensitivity ~ splicing, data=sens_anno)
wilcox_ss##
## Wilcoxon rank sum exact test
##
## data: sensitivity by splicing
## W = 55, p-value = 0.0004579
## alternative hypothesis: true location shift is not equal to 0
sens_anno %>%
group_by(splicing) %>%
summarise(mean_sens = mean(sensitivity))N = nrow(sens_anno)
N## [1] 16
Z = qnorm(wilcox_ss$p.value/2)
Z## [1] -3.504262
r = abs(Z)/sqrt(N)
r## [1] 0.8760654
library(rstatix)
library(ggpubr)
sens_anno %>% rstatix::wilcox_test(sensitivity ~ splicing)sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing)#sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing, ci = T)median_diff = tibble()
for (tool_can in sens_anno %>% filter(splicing == "canonical") %>% pull(tool)) {
sens_can = sens_anno %>% filter(tool == tool_can) %>% pull(sensitivity)
for (tool_non_can in sens_anno %>% filter(splicing == "non-canonical") %>% pull(tool)) {
sens_non_can = sens_anno %>% filter(tool == tool_non_can) %>% pull(sensitivity)
median_diff = median_diff %>%
bind_rows(tibble(tool_can, sens_can, tool_non_can, sens_non_can))
}
}
median_diff = median_diff %>%
mutate(sens_diff = sens_can - sens_non_can)
median_diffmedian_diff %>% pull(sens_diff) %>% median()## [1] 0.384535
wilcox.test(sensitivity ~ BSJ_filter, data=sens_anno) ##
## Wilcoxon rank sum exact test
##
## data: sensitivity by BSJ_filter
## W = 35, p-value = 0.4409
## alternative hypothesis: true location shift is not equal to 0